Airbnb is an online marketplace that connects people who want to rent out their homes with people who are looking for accommodations in that locale. It currently covers more than 81,000 cities and 191 countries worldwide. The company's name comes from “air mattress B&B.”
For hosts, participating in Airbnb is a way to earn some income from their property, but with the risk that the guest might do damage to it. For guests, the advantage can be relatively inexpensive accommodations, but with the risk that the property won’t be as appealing as the listing made it seem.
Airbnb: Advantages and Disadvantages
Confirm quality between neighbourhood and prices using a heatmap.
Positive and negative reviews per neighbourhood.
Average room type price by neighbourhoods.
Mapping Airbnb location.
| Name | User github |
|---|---|
| Edgar | EddOselotl |
| Geovanny | carlosgeovany |
| Caro | CaroAcostaT |
The clean data set can be found in the project repository here.
The next import and environment setting follow a suggestion from a GitHub question about geopandas not working.
import os
# Point PROJ_LIB at the conda environment's share directory (Windows path).
# A raw string is used because the path contains backslash sequences such as
# "\A" and "\e" that are invalid escapes in a normal literal, which Python
# warns about; the resulting value is unchanged.
os.environ["PROJ_LIB"] = r"C:\Anaconda\envs\env_name\Library\share"  # windows
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import geopandas
import branca
import pyproj
from geopandas import GeoDataFrame
from folium.features import GeoJson, GeoJsonTooltip
from scipy.stats import norm
from scipy import stats
Using pandas we can easily read our dataset
# Read the cleaned listings CSV from the project repository into a DataFrame.
df = pd.read_csv(
    "https://raw.githubusercontent.com/prope-2020-gh-classroom/practica-final-por-equipos-verano-2020-itam-EddOselotl/master/airbnb_clean.csv"
)
df.head()
df["price"].describe()
# Column names available in the data set.
df.columns
# Number of rows, followed by dtypes and the missing-value count per column.
print(len(df))
df.info()
df.isnull().sum()
print("\nIn Mexico City, the following neighborhoods have Airbnb rooms:\n")
# One bullet line per distinct neighbourhood, in order of first appearance.
for neighborhood in df["neighbourhood_cleansed"].unique():
    print("- {}".format(neighborhood))
Count how many Airbnb rooms there are per neighborhood.
# Listings per neighbourhood, most frequent first.
df["neighbourhood_cleansed"].value_counts()
# Per-neighbourhood listing count plus mean/median/min/max price,
# ordered from the most to the least listed neighbourhood.
price_aggregations = {
    "frequency": ("neighbourhood_cleansed", "count"),
    "mean_price": ("price", "mean"),
    "median_price": ("price", "median"),
    "min_price": ("price", "min"),
    "max_price": ("price", "max"),
}
df_stat = (
    df.groupby("neighbourhood_cleansed")
    .agg(**price_aggregations)
    .sort_values(by=["frequency"], ascending=False)
)
df_stat
# Annotated heatmap of the pairwise correlations between the
# per-neighbourhood statistics held in df_stat.
plt.figure(figsize=(15, 8))
corr = df_stat.corr()
sns.heatmap(corr, annot=True)
df_stat.columns
# Bar chart of the number of listings per neighbourhood.
# The column is named through x=/data= instead of passing the Series
# positionally: recent seaborn releases take `data` as the only positional
# parameter, so a bare Series is no longer read as the x variable.
plSe = sns.countplot(x="neighbourhood_cleansed", data=df, palette="plasma")
fig = plt.gcf()
fig.set_size_inches(20, 10)
plt.title('Neighbourhood')
# Tilt the neighbourhood names so they do not overlap.
plSe.set_xticklabels(plSe.get_xticklabels(), rotation=45)
plt.show()
As we can see, the neighborhood with the most Airbnb rooms is Cuauhtémoc.
# Natural log of the price, stored as a new column for the second pair of plots.
df["price_log"] = np.log(df["price"])

# For the raw price and then the log price: a histogram with a fitted normal
# curve, followed by a normal probability (Q-Q) plot of the same column.
for column, title in (
    ("price", "Price Distribution"),
    ("price_log", "Log-Price Distribution"),
):
    plt.figure(figsize=(15, 10))
    sns.distplot(df[column], fit=norm)
    plt.title(title, size=15, weight='bold')
    plt.show()

    plt.figure(figsize=(15, 10))
    stats.probplot(df[column], plot=plt)
    plt.show()
# Listing coordinates coloured by neighbourhood.
# x and y are passed by keyword because recent seaborn releases no longer
# accept them positionally.
plt.figure(figsize=(20, 20))
sns.scatterplot(data=df, x="longitude", y="latitude", hue="neighbourhood_cleansed")
# plt.show() renders the figure; the original called plt.ioff(), which only
# switches matplotlib's interactive mode off.
plt.show()

# Same coordinates coloured by price.
plt.figure(figsize=(20, 20))
sns.scatterplot(data=df, x="longitude", y="latitude", hue="price")
plt.show()
# Mean review score for each neighbourhood, printed in full.
scores_by_neighborhood = df.groupby(["neighbourhood_cleansed"])["review_scores_rating"]
mean_neighborhoods = scores_by_neighborhood.mean()
print(mean_neighborhoods.to_string())

# Score distribution for Cuauhtémoc only, ignoring listings without a score.
Cuau_df = df.loc[df.neighbourhood_cleansed == "Cuauhtémoc", "review_scores_rating"]
Cuau_nan = Cuau_df.dropna()
sns.distplot(Cuau_nan)
plt.title("Cuauhtémoc - score distribution")
plt.show()
# Plot the score distribution of every neighborhood on a shared 4x4 grid
sns.set(style="white", palette="colorblind", color_codes=False)
fig, axs = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(15, 10))
# The loop variable is named `ax` so that the `axs` grid is not overwritten
# by a single Axes while iterating.
for ax, neighborhood in zip(axs.ravel(), df.neighbourhood_cleansed.unique()):
    # Review scores of this neighborhood only
    df_score = df.loc[df.neighbourhood_cleansed == neighborhood, "review_scores_rating"]
    scores = df_score.dropna()  # Remove the NaNs
    sns.distplot(scores, ax=ax, kde_kws={"shade": True})
    ax.set_title(neighborhood)
    ax.set(xlabel="", xlim=(0, 120))
plt.tight_layout()
sns.despine(left=True)
It seems the worst-rated neighborhood is Milpa Alta, but we can't jump to that conclusion since the scores aren't balanced across neighborhoods.
# Get the neighborhood with max score.
# Series.max()/min() skip NaN the same way idxmax()/idxmin() do, so the
# reported score always belongs to the reported neighborhood; the builtin
# max()/min() give order-dependent results when a NaN mean is present.
best_neighborhood = mean_neighborhoods.idxmax()
max_score = round(mean_neighborhoods.max(), 2)
print("\n{} got the highest rate with {} out of 100 points. This takes into consideration the NaNs.\n".format(best_neighborhood, max_score))

# Get the neighborhood with min score
worst_neighborhood = mean_neighborhoods.idxmin()
min_score = round(mean_neighborhoods.min(), 2)
print("\n{} got the lowest rate with {} out of 100 points. This takes into consideration the NaNs.\n".format(worst_neighborhood, min_score))

# Share of listings that have a review score (count() ignores NaN scores)
number_of_reviews = df["review_scores_rating"].count()
reviews = round(number_of_reviews / len(df) * 100, 2)
print("\n{}% of the people left a review score\n".format(reviews))
# Now see the distribution of the bad reviews vs good reviews by neighborhood
# Let's say <90 is a bad review
bad_review = 90
bad_df = df[df["review_scores_rating"] < bad_review].groupby("neighbourhood_cleansed")["review_scores_rating"].count()
good_df = df[df["review_scores_rating"] >= bad_review].groupby("neighbourhood_cleansed")["review_scores_rating"].count()

# Build the table from both Series at once so the index is the union of the
# neighborhoods in either one. Assigning them one by one into an empty frame
# fixes the index to bad_df's, which drops neighborhoods that only have good
# reviews and leaves NaN where a neighborhood only has bad ones.
df_bad_good = (
    pd.DataFrame({"bad_reviews": bad_df, "good_reviews": good_df})
    .fillna(0)
    .astype(int)
)
# Every neighborhood in the union has at least one scored listing,
# so the denominator is never zero.
total_reviews = df_bad_good["bad_reviews"] + df_bad_good["good_reviews"]
df_bad_good["bad_review_per"] = (df_bad_good["bad_reviews"] / total_reviews).round(2)
df_bad_good["good_review_per"] = (df_bad_good["good_reviews"] / total_reviews).round(2)
df_bad_good
# Expose the index (neighborhood names) as a regular column for plotting.
df_bad_good["neighborhood"] = df_bad_good.index

# Stacked bars: bad-review counts in red with good-review counts on top in green.
fig, ax = plt.subplots()
labels = df_bad_good["neighborhood"]
bad_counts = df_bad_good["bad_reviews"]
good_counts = df_bad_good["good_reviews"]
ax.bar(labels, bad_counts, 0.5, label="bad reviews", color="r")
ax.bar(labels, good_counts, 0.5, label="good reviews", color="g", bottom=bad_counts)
ax.legend()
plt.xticks(rotation=90)
plt.show()
As we can see, this might not be the best plot to display the bad reviews vs. good reviews. So now let's try with the percentage of reviews instead of the raw review counts.
# Stacked bars of the bad (red) and good (green) review shares per neighborhood.
fig, ax = plt.subplots()
labels = df_bad_good["neighborhood"]
bad_share = df_bad_good["bad_review_per"]
good_share = df_bad_good["good_review_per"]
ax.bar(labels, bad_share, 0.5, label="bad reviews", color="r")
ax.bar(labels, good_share, 0.5, label="good reviews", color="g", bottom=bad_share)
ax.legend()
plt.xticks(rotation=90)
plt.show()
With this we can see that the probability of getting a good room, based on the reviews, is high (above 80%) except for Milpa Alta.
After relating bad and good reviews, we can see which neighborhood is the best or worst rated.
# Neighborhoods whose share of good reviews equals the highest share.
best_rate = df_bad_good.good_review_per.max()
best_rated_neighborhood = df_bad_good[df_bad_good.good_review_per == best_rate]
# The shares are rounded to two decimals, so "{:.0f}" prints the exact
# percentage without float artifacts such as 56.99999999999999.
print("The best rated (with {:.0f}%) neighborhoods are:".format(best_rate * 100))
for neighborhood in best_rated_neighborhood.index:
    print(" - {}".format(neighborhood))

# Neighborhoods whose share of bad reviews equals the highest share.
worst_rate = df_bad_good.bad_review_per.max()
worst_rated_neighborhood = df_bad_good[df_bad_good.bad_review_per == worst_rate]
print("The worst rated (with {:.0f}%) neighborhoods are:".format(worst_rate * 100))
for neighborhood in worst_rated_neighborhood.index:
    print(" - {}".format(neighborhood))
We present the top Airbnb accommodations in Mexico City based only on the score rating received.
We only plot those accommodations with a score rating of 99 or 100 on a $0-100$ scale.
# Listings whose review score is above 98.
top_scores = df.loc[df['review_scores_rating'] > 98]
# NOTE(review): this Figure is not attached to the map anywhere in this section.
f = folium.Figure(width=30, height=50)
# `zoom_start` is the folium.Map parameter; the original passed
# `default_zoom_start=5`, which folium does not recognise, so the map was drawn
# at folium's default zoom of 10. That level is kept explicitly here.
m = folium.Map(location=[19.451054, -99.125519], zoom_start=10)

# Read the City Hall boundaries here: the original referenced `gdf`, which is
# only defined further down the file, so running top to bottom raised NameError.
boundaries = geopandas.read_file(
    "https://raw.githubusercontent.com/prope-2020-gh-classroom/practica-final-por-equipos-verano-2020-itam-EddOselotl/master/neighbourhoods.geojson"
)
folium.GeoJson(
    boundaries,
    style_function=lambda feature: {
        'fillColor': 'grey',  # was 'gery', which is not a colour name
        'color': 'black',
        'weight': 1,
        'fillOpacity': 0.3,
    }
).add_to(m)

# One circle per top-rated listing, with its details in the tooltip.
for index, row in top_scores.iterrows():
    tooltip = 'City Hall: {}<br>Room Type: {}<br>Square Feet: {}<br>Price: ${}mxp'.format(row['neighbourhood_cleansed'], row["room_type"], row["square_feet"], row['price'])
    folium.Circle(
        location=[row["latitude"], row["longitude"]],
        tooltip=tooltip,
        radius=10,
        fill=True,
    ).add_to(m)
m
Our main goal will be to find the average price of an Airbnb accommodation by room type among the 16 City Halls in Mexico City and to visualize this data on an interactive map.
Every accommodation offered on Airbnb must be listed as one of these room types: Entire home/apt, Hotel room, Private room, or Shared room.
By calculating the average price for each of these room types by City Hall, we will be able to find the most expensive and the cheapest City Hall in which to get an Airbnb accommodation for each room type.
Additionally, we will plot a map with the top accommodations by price.
The results are displayed in seaborn plots and in interactive maps that use folium to plot geospatial data.
geojson = "https://raw.githubusercontent.com/prope-2020-gh-classroom/practica-final-por-equipos-verano-2020-itam-EddOselotl/master/neighbourhoods.geojson"
### load the boundary file as a geopandas object for mapping
geop = geopandas.read_file(geojson)

### mean price with neighbourhoods as rows and room types as columns
mean_price_by_hall = df.groupby(['neighbourhood_cleansed', 'room_type'])['price'].mean()
desc = mean_price_by_hall.unstack(1)

### the same means in long form (one row per room type / neighbourhood) for plotting
mean_price_by_type = df.groupby(['room_type', 'neighbourhood_cleansed'])['price'].mean()
room_type = mean_price_by_type.reset_index(name='mean_by_room')

### swarm plot first, then bar plot, both with tilted x labels
for plot_kind in ("swarm", "bar"):
    grid = sns.catplot(
        x="room_type",
        y="mean_by_room",
        hue="neighbourhood_cleansed",
        kind=plot_kind,
        data=room_type,
    )
    grid.set_xticklabels(rotation=45);
The next step is to merge the mean prices by room type and the number of accommodations available per category by City Hall into one data frame, and then map the data.
# Number of listings with neighbourhoods as rows and room types as columns.
count = df.groupby(['neighbourhood_cleansed', 'room_type'])['id'].count().unstack(1)
# Turn the neighbourhood index into a column so both tables can be merged on it.
desc.reset_index(level=0, inplace=True)
count.reset_index(level=0, inplace=True)
# Both tables share the same room-type column names, so the merge suffixes name
# them "<room type> mean" and "<room type> count" directly. This replaces the
# hard-coded rename of every "_x"/"_y" column and works for any set of room types.
merged = pd.merge(
    left=desc,
    right=count,
    on='neighbourhood_cleansed',
    suffixes=(' mean', ' count'),
)
merged.rename(columns={'neighbourhood_cleansed': 'neighbourhood'}, inplace=True)
We create one map for each one of the room types:
#### Entire home/apt map
# Mean price and listing count for this room type, joined to the boundary
# polygons on the neighbourhood name.
home_apt = merged[['neighbourhood', 'Entire home/apt mean', 'Entire home/apt count']]
home_apt = pd.merge(left=home_apt, right=geop, on='neighbourhood')
# Drop the fourth column (position 3), which comes from the boundary file.
# NOTE(review): presumably a grouping column that is not needed; confirm.
home_apt.drop(home_apt.columns[[3]], axis=1, inplace=True)
gdf = GeoDataFrame(home_apt, crs="EPSG:4326", geometry=home_apt.geometry)

# Colour scale running from the lowest to the highest mean price.
colormap = branca.colormap.LinearColormap(
    vmin=gdf['Entire home/apt mean'].quantile(0.0),
    vmax=gdf['Entire home/apt mean'].quantile(1.0),
    colors=['darkgreen', 'green', 'lightblue', 'blue', 'yellow', 'orange', 'red'],
    caption="Airbnb Entire home/apt mean price by City Hall",
)

# `zoom_start` is the folium.Map parameter; the original `default_zoom_start=5`
# is not recognised by folium, so the map was drawn at the default zoom of 10.
# That level is kept explicitly here.
m = folium.Map(location=[19.451054, -99.125519],
               zoom_start=10,
               tiles="Cartodb Positron")

tooltip = GeoJsonTooltip(
    fields=["neighbourhood", "Entire home/apt mean", "Entire home/apt count"],
    aliases=["City Hall:", "Mean Price:", "Total Accommodations:"],
    localize=True,
    sticky=False,
    labels=True,
    style="""
background-color: #F0EFEF;
border: 2px solid black;
border-radius: 3px;
box-shadow: 3px;
""",
    max_width=800,
)

# Fill each polygon by its mean price; polygons without a value stay transparent.
g = folium.GeoJson(
    gdf,
    style_function=lambda x: {
        "fillColor": colormap(x["properties"]["Entire home/apt mean"])
        if x["properties"]["Entire home/apt mean"] is not None
        else "transparent",
        "color": "black",
        "fillOpacity": 0.4,
    },
    tooltip=tooltip
).add_to(m)
colormap.add_to(m)
m
#### Hotel room map
# Mean price and listing count for this room type, joined to the boundary
# polygons on the neighbourhood name.
hotel = merged[['neighbourhood', 'Hotel room mean', 'Hotel room count']]
hotel = pd.merge(left=hotel, right=geop, on='neighbourhood')
# Drop the fourth column (position 3), which comes from the boundary file.
# NOTE(review): presumably a grouping column that is not needed; confirm.
hotel.drop(hotel.columns[[3]], axis=1, inplace=True)
# Use this table's own geometry column; the original borrowed
# `home_apt.geometry`, a copy-paste slip that only works while both tables
# happen to share the same row order.
gdf = GeoDataFrame(hotel, crs="EPSG:4326", geometry=hotel.geometry)

# Colour scale running from the lowest to the highest mean price.
colormap = branca.colormap.LinearColormap(
    vmin=gdf['Hotel room mean'].quantile(0.0),
    vmax=gdf['Hotel room mean'].quantile(1.0),
    colors=['darkgreen', 'green', 'lightblue', 'blue', 'yellow', 'orange', 'red'],
    caption="Airbnb Hotel room mean price by City Hall",
)

# `zoom_start` is the folium.Map parameter; the original `default_zoom_start=5`
# is not recognised by folium, so the map was drawn at the default zoom of 10.
# That level is kept explicitly here.
m = folium.Map(location=[19.451054, -99.125519],
               zoom_start=10,
               tiles="Cartodb Positron")

tooltip = GeoJsonTooltip(
    fields=["neighbourhood", "Hotel room mean", "Hotel room count"],
    aliases=["City Hall:", "Mean Price:", "Total Accommodations:"],
    localize=True,
    sticky=False,
    labels=True,
    style="""
background-color: #F0EFEF;
border: 2px solid black;
border-radius: 3px;
box-shadow: 3px;
""",
    max_width=800,
)

# Fill each polygon by its mean price; polygons without a value stay transparent.
folium.GeoJson(
    gdf,
    style_function=lambda x: {
        "fillColor": colormap(x["properties"]["Hotel room mean"])
        if x["properties"]["Hotel room mean"] is not None
        else "transparent",
        "color": "black",
        "fillOpacity": 0.4,
    },
    tooltip=tooltip
).add_to(m)
colormap.add_to(m)
m
#### Private room map
# Mean price and listing count for this room type, joined to the boundary
# polygons on the neighbourhood name.
room = merged[['neighbourhood', 'Private room mean', 'Private room count']]
room = pd.merge(left=room, right=geop, on='neighbourhood')
# Drop the fourth column (position 3), which comes from the boundary file.
# NOTE(review): presumably a grouping column that is not needed; confirm.
room.drop(room.columns[[3]], axis=1, inplace=True)
# Use this table's own geometry column; the original borrowed
# `home_apt.geometry`, a copy-paste slip that only works while both tables
# happen to share the same row order.
gdf = GeoDataFrame(room, crs="EPSG:4326", geometry=room.geometry)

# Colour scale running from the lowest to the highest mean price.
colormap = branca.colormap.LinearColormap(
    vmin=gdf['Private room mean'].quantile(0.0),
    vmax=gdf['Private room mean'].quantile(1.0),
    colors=['darkgreen', 'green', 'lightblue', 'blue', 'yellow', 'orange', 'red'],
    caption="Airbnb Private room mean price by City Hall",
)

# `zoom_start` is the folium.Map parameter; the original `default_zoom_start=5`
# is not recognised by folium, so the map was drawn at the default zoom of 10.
# That level is kept explicitly here.
m = folium.Map(location=[19.451054, -99.125519],
               zoom_start=10,
               tiles="Cartodb Positron")

tooltip = GeoJsonTooltip(
    fields=["neighbourhood", "Private room mean", "Private room count"],
    aliases=["City Hall:", "Mean Price:", "Total Accommodations:"],
    localize=True,
    sticky=False,
    labels=True,
    style="""
background-color: #F0EFEF;
border: 2px solid black;
border-radius: 3px;
box-shadow: 3px;
""",
    max_width=800,
)

# Fill each polygon by its mean price; polygons without a value stay transparent.
folium.GeoJson(
    gdf,
    style_function=lambda x: {
        "fillColor": colormap(x["properties"]["Private room mean"])
        if x["properties"]["Private room mean"] is not None
        else "transparent",
        "color": "black",
        "fillOpacity": 0.4,
    },
    tooltip=tooltip
).add_to(m)
colormap.add_to(m)
m
#### Shared room map
# Mean price and listing count for this room type, joined to the boundary
# polygons on the neighbourhood name.
shared = merged[['neighbourhood', 'Shared room mean', 'Shared room count']]
shared = pd.merge(left=shared, right=geop, on='neighbourhood')
# Drop the fourth column (position 3) of THIS table. The original indexed
# `room.columns[[3]]`; `room` had already lost its fourth column by then, so
# that lookup named a different column (the geometry, if the boundary file has
# the layout the other map cells assume) and dropped it from `shared` instead.
# NOTE(review): the dropped column is presumably an unneeded grouping column; confirm.
shared.drop(shared.columns[[3]], axis=1, inplace=True)
# Use this table's own geometry column instead of borrowing `home_apt.geometry`.
gdf = GeoDataFrame(shared, crs="EPSG:4326", geometry=shared.geometry)

# Colour scale running from the lowest to the highest mean price.
colormap = branca.colormap.LinearColormap(
    vmin=gdf['Shared room mean'].quantile(0.0),
    vmax=gdf['Shared room mean'].quantile(1.0),
    colors=['darkgreen', 'green', 'lightblue', 'blue', 'yellow', 'orange', 'red'],
    caption="Airbnb Shared room mean price by City Hall",
)

# `zoom_start` is the folium.Map parameter; the original `default_zoom_start=5`
# is not recognised by folium, so the map was drawn at the default zoom of 10.
# That level is kept explicitly here.
m = folium.Map(location=[19.451054, -99.125519],
               zoom_start=10,
               tiles="Cartodb Positron")

tooltip = GeoJsonTooltip(
    fields=["neighbourhood", "Shared room mean", "Shared room count"],
    aliases=["City Hall:", "Mean Price:", "Total Accommodations:"],
    localize=True,
    sticky=False,
    labels=True,
    style="""
background-color: #F0EFEF;
border: 2px solid black;
border-radius: 3px;
box-shadow: 3px;
""",
    max_width=800,
)

# Fill each polygon by its mean price; polygons without a value stay transparent.
folium.GeoJson(
    gdf,
    style_function=lambda x: {
        "fillColor": colormap(x["properties"]["Shared room mean"])
        if x["properties"]["Shared room mean"] is not None
        else "transparent",
        "color": "black",
        "fillOpacity": 0.4,
    },
    tooltip=tooltip
).add_to(m)
colormap.add_to(m)
m
### mean price table: one row per City Hall, one column per room type
price_by_hall_and_type = df.groupby(['neighbourhood_cleansed', 'room_type'])['price']
tops = price_by_hall_and_type.mean().unstack(level=1)
# City Hall with the highest, then the lowest, mean price for each room type.
tops.idxmax(axis=0)
tops.idxmin(axis=0)
As we can see, the Miguel Hidalgo City Hall leads in both of the expensive room type categories. On the other hand, Azcapotzalco leads in two of the cheaper room type categories. Looking at the map, these two City Halls sit side by side, and both are close to downtown Mexico City. So, based only on the mean price by room type, we would recommend that a foreign visitor reserve an Airbnb accommodation in the Azcapotzalco City Hall. See the heat map below.
# Heat map of the mean price table (City Halls as rows, room types as columns).
heatmap_options = {
    "square": True,
    "cbar_kws": {'fraction': 0.3},
    "cmap": 'OrRd',
    "linewidth": 1,
}
plt.title("Room types mean price by City Hall", fontsize=13)
sns.heatmap(tops, **heatmap_options);
Next we present the accommodations offered by City Hall and room type
# Listing counts: one row per City Hall, one column per room type.
listings_by_hall_and_type = df.groupby(['neighbourhood_cleansed', 'room_type'])['id']
counts = listings_by_hall_and_type.count().unstack(level=1)
# City Hall with the most, then the fewest, listings for each room type.
counts.idxmax(axis=0)
counts.idxmin(axis=0)
# Heat map of the counts table.
heatmap_options = {
    "square": True,
    "cbar_kws": {'fraction': 0.3},
    "cmap": 'OrRd',
    "linewidth": 1,
}
plt.title("Room types accomodations by City Hall", fontsize=13)
sns.heatmap(counts, **heatmap_options);